home *** CD-ROM | disk | FTP | other *** search
- /*
- * Listing 12 -- model-2.c
- *
- * This module contains all of the modeling functions used with
- * comp-2.c and expand-2.c. This modeling unit keeps track of
- * all contexts from 0 up to max_order, which defaults to 3.
- * In addition, there is a special context -1 which is a fixed model
- * used to encode previously unseen characters, and a context -2
- * which is used to encode EOF and FLUSH messages.
- *
- * Each context is stored in a special CONTEXT structure, which is
- * documented below. Context tables are not created until the
- * context is seen, and they are never destroyed.
- *
- */
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include "coder.h"
- #include "model.h"
-
- /*
- * This program consumes massive amounts of memory. One way to
- * handle large amounts of memory is to use Zortech's __handle
- * pointer type. So that my code will run with other compilers
- * as well, the handle stuff gets redefined when using other
- * compilers.
- */
- #ifdef __ZTC__
- #include <handle.h>
- #else
- #define __handle
- #define handle_calloc( a ) calloc( (a), 1 )
- #define handle_realloc( a, b ) realloc( (a), (b) )
- #define handle_free( a ) free( (a) )
- #endif
-
- /* A context table contains a list of the counts for all symbols
- * that have been seen in the defined context. For example, a
- * context of "Zor" might have only had 2 different characters
- * appear. 't' might have appeared 10 times, and 'l' might have
- * appeared once. These two counts are stored in the context
- * table. The counts are stored in the STATS structure. All of
- * the counts for a given context are stored in and array of STATS.
- * As new characters are added to a particular contexts, the STATS
- * array will grow. Sometimes, the STATS array will shrink
- * after flushing the model.
- */
- typedef struct {
- unsigned char symbol;
- unsigned char counts;
- } STATS;
-
- /*
- * Each context has to have links to higher order contexts. These
- * links are used to navigate through the context tables. For example,
- * to find the context table for "ABC", I start at the order 0 table,
- * then find the pointer to the "A" context table by looking through
- * then LINKS array. At that table, we find the "B" link and go to
- * that table. The process continues until the destination table is
- * found. The table pointed to by the LINKS array corresponds to the
- * symbol found at the same offset in the STATS table. The reason that
- * LINKS is in a separate structure instead of being combined with
- * STATS is to save space. All of the leaf context nodes don't need
- * next pointers, since they are in the highest order context. In the
- * leaf nodes, the LINKS array is a NULL pointers.
- */
- typedef struct {
- struct context *next;
- } LINKS;
-
- /*
- * The CONTEXT structure holds all of the know information about
- * a particular context. The links and stats pointers are discussed
- * immediately above here. The max_index element gives the maximum
- * index that can be applied to the stats or link array. When the
- * table is first created, and stats is set to NULL, max_index is set
- * to -1. As soon as single element is added to stats, max_index is
- * incremented to 0.
- *
- * The lesser context pointer is a navigational aid. It points to
- * the context that is one less than the current order. For example,
- * if the current context is "ABC", the lesser_context pointer will
- * point to "BC". The reason for maintaining this pointer is that
- * this particular bit of table searching is done frequently, but
- * the pointer only needs to be built once, when the context is
- * created.
- */
- typedef struct context {
- int max_index;
- LINKS __handle *links;
- STATS __handle *stats;
- struct context *lesser_context;
- } CONTEXT;
- /*
- * max_order is the maximum order that will be maintained by this
- * program. EXPAND-2 and COMP-2 both will modify this int based
- * on command line parameters.
- */
- int max_order=3;
- /*
- * *contexts[] is an array of current contexts. If I want to find
- * the order 0 context for the current state of the model, I just
- * look at contexts[0]. This array of context pointers is set up
- * every time the model is updated.
- */
- CONTEXT **contexts;
- /*
- * current_order contains the current order of the model. It starts
- * at max_order, and is decremented every time an ESCAPE is sent. It
- * will only go down to -1 for normal symbols, but can go to -2 for
- * EOF and FLUSH.
- */
- int current_order;
- /*
- * This variable tells COMP-2.C that the FLUSH symbol can be
- * sent using this model.
- */
- int flushing_enabled=1;
- /*
- * This table contains the cumulative totals for the current context.
- * Because this program is using exclusion, totals has to be calculated
- * every time a context is used. The scoreboard array keeps track of
- * symbols that have appeared in higher order models, so that they
- * can be excluded from lower order context total calculations.
- */
- short int totals[ 258 ];
- char scoreboard[ 256 ];
-
- /*
- * Local procedure declarations.
- */
- void error_exit( char *message );
- void update_table( CONTEXT *table, unsigned char symbol );
- void rescale_table( CONTEXT *table );
- void totalize_table( CONTEXT *table );
- CONTEXT *shift_to_next_context( CONTEXT *table, unsigned char c, int order);
- CONTEXT *allocate_next_order_table( CONTEXT *table,
- unsigned char symbol,
- CONTEXT *lesser_context );
-
- /*
- * This routine has to get everything set up properly so that
- * the model can be maintained properly. The first step is to create
- * the *contexts[] array used later to find current context tables.
- * The *contexts[] array indices go from -2 up to max_order, so
- * the table needs to be fiddled with a little. This routine then
- * has to create the special order -2 and order -1 tables by hand,
- * since they aren't quite like other tables. Then the current
- * context is set to \0, \0, \0, ... and the appropriate tables
- * are built to support that context. The current order is set
- * to max_order, the scoreboard is cleared, and the system is
- * ready to go.
- */
-
- void initialize_model()
- {
- int i;
- CONTEXT *null_table;
- CONTEXT *control_table;
-
- current_order = max_order;
- contexts = (CONTEXT **) calloc( sizeof( CONTEXT * ), 10 );
- if ( contexts == NULL )
- error_exit( "Failure #1: allocating context table!" );
- contexts += 2;
- null_table = (CONTEXT *) calloc( sizeof( CONTEXT ), 1 );
- if ( null_table == NULL )
- error_exit( "Failure #2: allocating null table!" );
- null_table->max_index = -1;
- contexts[ -1 ] = null_table;
- for ( i = 0 ; i <= max_order ; i++ )
- contexts[ i ] = allocate_next_order_table( contexts[ i-1 ],
- 0,
- contexts[ i-1 ] );
- handle_free( (char __handle *) null_table->stats );
- null_table->stats =
- (STATS __handle *) handle_calloc( sizeof( STATS ) * 256 );
- if ( null_table->stats == NULL )
- error_exit( "Failure #3: allocating null table!" );
- null_table->max_index = 255;
- for ( i=0 ; i < 256 ; i++ )
- {
- null_table->stats[ i ].symbol = (unsigned char) i;
- null_table->stats[ i ].counts = 1;
- }
-
- control_table = (CONTEXT *) calloc( sizeof(CONTEXT), 1 );
- if ( control_table == NULL )
- error_exit( "Failure #4: allocating null table!" );
- control_table->stats =
- (STATS __handle *) handle_calloc( sizeof( STATS ) * 2 );
- if ( control_table->stats == NULL )
- error_exit( "Failure #5: allocating null table!" );
- contexts[ -2 ] = control_table;
- control_table->max_index = 1;
- control_table->stats[ 0 ].symbol = -FLUSH;
- control_table->stats[ 0 ].counts = 1;
- control_table->stats[ 1 ].symbol =- DONE;
- control_table->stats[ 1 ].counts = 1;
-
- for ( i = 0 ; i < 256 ; i++ )
- scoreboard[ i ] = 0;
- }
- /*
- * This is a utility routine used to create new tables when a new
- * context is created. It gets a pointer to the current context,
- * and gets the symbol that needs to be added to it. It also needs
- * a pointer to the lesser context for the table that is to be
- * created. For example, if the current context was "ABC", and the
- * symbol 'D' was read in, add_character_to_model would need to
- * create the new context "BCD". This routine would get called
- * with a pointer to "BC", the symbol 'D', and a pointer to context
- * "CD". This routine then creates a new table for "BCD", adds it
- * to the link table for "BC", and gives "BCD" a back pointer to
- * "CD". Note that finding the lesser context is a difficult
- * task, and isn't done here. This routine mainly worries about
- * modifying the stats and links fields in the current context.
- */
-
- CONTEXT *allocate_next_order_table( CONTEXT *table,
- unsigned char symbol,
- CONTEXT *lesser_context )
- {
- CONTEXT *new_table;
- int i;
- unsigned int new_size;
-
- for ( i = 0 ; i <= table->max_index ; i++ )
- if ( table->stats[ i ].symbol == symbol )
- break;
- if ( i > table->max_index )
- {
- table->max_index++;
- new_size = sizeof( LINKS );
- new_size *= table->max_index + 1;
- if ( table->links == NULL )
- table->links = (LINKS __handle *) handle_calloc( new_size );
- else
- table->links = (LINKS __handle *)
- handle_realloc( (char __handle *) table->links, new_size );
- new_size = sizeof( STATS );
- new_size *= table->max_index + 1;
- if ( table->stats == NULL )
- table->stats = (STATS __handle *) handle_calloc( new_size );
- else
- table->stats = (STATS __handle *)
- handle_realloc( (char __handle *) table->stats, new_size );
- if ( table->links == NULL )
- error_exit( "Failure #6: allocating new table" );
- if ( table->stats == NULL )
- error_exit( "Failure #7: allocating new table" );
- table->stats[ i ].symbol = symbol;
- table->stats[ i ].counts = 0;
- }
- new_table = (CONTEXT *) calloc( sizeof( CONTEXT ), 1 );
- if ( new_table == NULL )
- error_exit( "Failure #8: allocating new table" );
- new_table->max_index = -1;
- table->links[ i ].next = new_table;
- new_table->lesser_context = lesser_context;
- return( new_table );
- }
-
- /*
- * This routine is called to increment the counts for the current
- * contexts. It is called after a character has been encoded or
- * decoded. All it does is call update_table for each of the
- * current contexts, which does the work of incrementing the count.
- * This particular version of update_model() practices update exclusion,
- * which means that if lower order models weren't used to encode
- * or decode the character, they don't get their counts updated.
- * This seems to improve compression performance quite a bit.
- * To disable update exclusion, the loop would be changed to run
- * from 0 to max_order, instead of current_order to max_order.
- */
- void update_model( int symbol )
- {
- int i;
- int local_order;
-
- if ( current_order < 0 )
- local_order = 0;
- else
- local_order = current_order;
- if ( symbol >= 0 )
- {
- while ( local_order <= max_order )
- {
- if ( symbol >= 0 )
- update_table( contexts[ local_order ], (unsigned char) symbol );
- local_order++;
- }
- }
- current_order = max_order;
- for ( i = 0 ; i < 256 ; i++ )
- scoreboard[ i ] = 0;
- }
- /*
- * This routine is called to update the count for a particular symbol
- * in a particular table. The table is one of the current contexts,
- * and the symbol is the last symbol encoded or decoded. In principle
- * this is a fairly simple routine, but a couple of complications make
- * things a little messier. First of all, the given table may not
- * already have the symbol defined in its statistics table. If it
- * doesn't, the stats table has to grow and have the new guy added
- * to it. Secondly, the symbols are kept in sorted order by count
- * in the table so as that the table can be trimmed during the flush
- * operation. When this symbol is incremented, it might have to be moved
- * up to reflect its new rank. Finally, since the counters are only
- * bytes, if the count reaches 255, the table absolutely must be rescaled
- * to get the counts back down to a reasonable level.
- */
- void update_table( CONTEXT *table, unsigned char symbol )
- {
- int i;
- int index;
- unsigned char temp;
- CONTEXT *temp_ptr;
- unsigned int new_size;
- /*
- * First, find the symbol in the appropriate context table. The first
- * symbol in the table is the most active, so start there.
- */
- index = 0;
- while ( index <= table->max_index &&
- table->stats[index].symbol != symbol )
- index++;
- if ( index > table->max_index )
- {
- table->max_index++;
- new_size = sizeof( LINKS );
- new_size *= table->max_index + 1;
- if ( current_order < max_order )
- {
- if ( table->max_index == 0 )
- table->links = (LINKS __handle *) handle_calloc( new_size );
- else
- table->links = (LINKS __handle *)
- handle_realloc( (char __handle *) table->links, new_size );
- if ( table->links == NULL )
- error_exit( "Error #9: reallocating table space!" );
- table->links[ index ].next = NULL;
- }
- new_size = sizeof( STATS );
- new_size *= table->max_index + 1;
- if (table->max_index==0)
- table->stats = (STATS __handle *) handle_calloc( new_size );
- else
- table->stats = (STATS __handle *)
- handle_realloc( (char __handle *) table->stats, new_size );
- if ( table->stats == NULL )
- error_exit( "Error #10: reallocating table space!" );
- table->stats[ index ].symbol = symbol;
- table->stats[ index ].counts = 0;
- }
- /*
- * Now I move the symbol to the front of its list.
- */
- i = index;
- while ( i > 0 &&
- table->stats[ index ].counts == table->stats[ i-1 ].counts )
- i--;
- if ( i != index )
- {
- temp = table->stats[ index ].symbol;
- table->stats[ index ].symbol = table->stats[ i ].symbol;
- table->stats[ i ].symbol = temp;
- if ( table->links != NULL )
- {
- temp_ptr = table->links[ index ].next;
- table->links[ index ].next = table->links[ i ].next;
- table->links[ i ].next = temp_ptr;
- }
- index = i;
- }
- /*
- * The switch has been performed, now I can update the counts
- */
- table->stats[ index ].counts++;
- if ( table->stats[ index ].counts == 255 )
- rescale_table( table );
- }
-
- /*
- * This routine is called when a given symbol needs to be encoded.
- * It is the job of this routine to find the symbol in the context
- * table associated with the current table, and return the low and
- * high counts associated with that symbol, as well as the scale.
- * Finding the table is simple. Unfortunately, once I find the table,
- * I have to build the table of cumulative counts, which is
- * expensive, and is done elsewhere. If the symbol is found in the
- * table, the appropriate counts are returned. If the symbols is
- * not found, the ESCAPE symbol probabilities are returned, and
- * the current order is reduced. Note also the kludge to support
- * the order -2 character set, which consists of negative numbers
- * instead of unsigned chars. This insures that no match will every
- * be found for the EOF or FLUSH symbols in any "normal" table.
- */
- int convert_int_to_symbol( int c, SYMBOL *s )
- {
- int i;
- CONTEXT *table;
-
- table = contexts[ current_order ];
- totalize_table( table );
- s->scale = totals[ 0 ];
- if ( current_order == -2 )
- c = -c;
- for ( i = 0 ; i <= table->max_index ; i++ )
- {
- if ( c == (int) table->stats[ i ].symbol )
- {
- if ( table->stats[ i ].counts == 0 )
- break;
- s->low_count = totals[ i+2 ];
- s->high_count = totals[ i+1 ];
- return( 0 );
- }
- }
- s->low_count = totals[ 1 ];
- s->high_count = totals[ 0 ];
- current_order--;
- return( 1 );
- }
- /*
- * This routine is called when decoding an arithmetic number. In
- * order to decode the present symbol, the current scale in the
- * model must be determined. This requires looking up the current
- * table, then building the totals table. Once that is done, the
- * cumulative total table has the symbol scale at element 0.
- */
- void get_symbol_scale( SYMBOL *s )
- {
- CONTEXT *table;
-
- table = contexts[ current_order ];
- totalize_table( table );
- s->scale = totals[ 0 ];
- }
-
- /*
- * This routine is called during decoding. It is given a count that
- * came out of the arithmetic decoder, and has to find the symbol that
- * matches the count. The cumulative totals are already stored in the
- * totals[] table, form the call to get_symbol_scale, so this routine
- * just has to look through that table. Once the match is found,
- * the appropriate character is returned to the caller. Two possible
- * complications. First, the character might be the ESCAPE character,
- * in which case the current_order has to be decremented. The other
- * complication is that the order might be -2, in which case we return
- * the negative of the symbol so it isn't confused with a normal
- * symbol.
- */
- int convert_symbol_to_int( int count, SYMBOL *s)
- {
- int c;
- CONTEXT *table;
-
- table = contexts[ current_order ];
- for ( c = 0; count < totals[ c ] ; c++ )
- ;
- s->high_count = totals[ c-1 ];
- s->low_count = totals[ c ];
- if ( c == 1 )
- {
- current_order--;
- return( ESCAPE );
- }
- if ( current_order < -1 )
- return( (int) -table->stats[ c-2 ].symbol );
- else
- return( table->stats[ c-2 ].symbol );
- }
-
-
- /*
- * After the model has been updated for a new character, this routine
- * is called to "shift" into the new context. For example, if the
- * last context was "ABC", and the symbol 'D' had just been processed,
- * this routine would want to update the context pointers to that
- * contexts[1]=="D", contexts[2]=="CD" and contexts[3]=="BCD". The
- * potential problem is that some of these tables may not exist.
- * The way this is handled is by the shift_to_next_context routine.
- * It is passed a pointer to the "ABC" context, along with the symbol
- * 'D', and its job is to return a pointer to "BCD". Once we have
- * "BCD", we can follow the lesser context pointers in order to get
- * the pointers to "CD" and "C". The hard work was done in
- * shift_to_context().
- */
- void add_character_to_model( int c )
- {
- int i;
- if ( max_order < 0 || c < 0 )
- return;
- contexts[ max_order ] =
- shift_to_next_context( contexts[ max_order ],
- (unsigned char) c, max_order );
- for ( i = max_order-1 ; i > 0 ; i-- )
- contexts[ i ] = contexts[ i+1 ]->lesser_context;
- }
-
- /*
- * This routine is called when adding a new character to the model. From
- * the previous example, if the current context was "ABC", and the new
- * symbol was 'D', this routine would get called with a pointer to
- * context table "ABC", and symbol 'D', with order max_order. What this
- * routine needs to do then is to find the context table "BCD". This
- * should be an easy job, and it is if the table already exists. All
- * we have to in that case is follow the back pointer from "ABC" to "BC".
- * We then search the link table of "BC" until we find the linke to "D".
- * That link points to "BCD", and that value is then returned to the
- * caller. The problem crops up when "BC" doesn't have a pointer to
- * "BCD". This generally means that the "BCD" context has not appeared
- * yet. When this happens, it means a new table has to be created and
- * added to the "BC" table. That can be done with a single call to
- * the allocate_new_table routine. The only problem is that the
- * allocate_new_table routine wants to know what the lesser context for
- * the new table is going to be. In other words, when I create "BCD",
- * I need to know where "CD" is located. In order to find "CD", I
- * have to recursively call shift_to_next_context, passing it a pointer
- * to context "C" and they symbol 'D'. It then returns a pointer to
- * "CD", which I use to create the "BCD" table. The recursion is guaranteed
- * to end if it ever gets to order -1, because the null table is
- * guaranteed to have a for every symbol to the order 0 table. This is
- * the most complicated part of the modeling program, but it is
- * necessary for performance reasons.
- */
- CONTEXT *shift_to_next_context( CONTEXT *table, unsigned char c, int order)
- {
- int i;
- CONTEXT *new_lesser;
- /*
- * First, try to find the new context by backing up to the lesser
- * context and searching its link table. If I find the link, we take
- * a quick and easy exit, returning the link. Note that their is a
- * special Kludge for context order 0. We know for a fact that
- * the lesser context pointer at order 0 points to the null table,
- * order -1, and we know that the -1 table only has a single link
- * pointer, which points back to the order 0 table.
- */
- table = table->lesser_context;
- if ( order == 0 )
- return( table->links[ 0 ].next );
- for ( i = 0 ; i <= table->max_index ; i++ )
- if ( table->stats[ i ].symbol == c )
- if ( table->links[ i ].next != NULL )
- return( table->links[ i ].next );
- else
- break;
- /*
- * If I get here, it means the new context did not exist. I have to
- * create the new context, add a link to it here, and add the backwards
- * link to *his* previous context. Creating the table and adding it to
- * this table is pretty easy, but adding the back pointer isn't. Since
- * creating the new back pointer isn't easy, I duck my responsibility
- * and recurse to myself in order to pick it up.
- */
- new_lesser = shift_to_next_context( table, c, order-1 );
- /*
- * Now that I have the back pointer for this table, I can make a call
- * to a utility to allocate the new table.
- */
- table = allocate_next_order_table( table, c, new_lesser );
- return( table );
- }
-
- /*
- * Rescaling the table needs to be done for one of three reasons.
- * First, if the maximum count for the table has exceeded 16383, it
- * means that arithmetic coding using 16 and 32 bit registers might
- * no longer work. Secondly, if an individual symbol count has
- * reached 255, it will no longer fit in a byte. Third, if the
- * current model isn't compressing well, the compressor program may
- * want to rescale all tables in order to give more weight to newer
- * statistics. All this routine does is divide each count by 2.
- * If any counts drop to 0, the counters can be removed from the
- * stats table, but only if this is a leaf context. Otherwise, we
- * might cut a link to a higher order table.
- */
- void rescale_table( CONTEXT *table )
- {
- int i;
-
- if ( table->max_index == -1 )
- return;
- for ( i = 0 ; i <= table->max_index ; i++ )
- table->stats[ i ].counts /= 2;
- if ( table->stats[ table->max_index ].counts == 0 &&
- table->links == NULL )
- {
- while ( table->stats[ table->max_index ].counts == 0 &&
- table->max_index >= 0 )
- table->max_index--;
- if ( table->max_index == -1 )
- {
- handle_free( (char __handle *) table->stats );
- table->stats = NULL;
- }
- else
- {
- table->stats = (STATS __handle *)
- handle_realloc( (char __handle *) table->stats,
- sizeof( STATS ) * ( table->max_index + 1 ) );
- if ( table->stats == NULL )
- error_exit( "Error #11: reallocating stats space!" );
- }
- }
- }
-
- /*
- * This routine has the job of creating a cumulative totals table for
- * a given context. The cumulative low and high for symbol c are going to
- * be stored in totals[c+2] and totals[c+1]. Locations 0 and 1 are
- * reserved for the special ESCAPE symbol. The ESCAPE symbol
- * count is calculated dynamically, and changes based on what the
- * current context looks like. Note also that this routine ignores
- * any counts for symbols that have already showed up in the scoreboard,
- * and it adds all new symbols found here to the scoreboard. This
- * allows us to exclude counts of symbols that have already appeared in
- * higher order contexts, improving compression quite a bit.
- */
- void totalize_table( CONTEXT *table )
- {
- int i;
- unsigned char max;
-
- for ( ; ; )
- {
- max = 0;
- i = table->max_index + 2;
- totals[ i ] = 0;
- for ( ; i > 1 ; i-- )
- {
- totals[ i-1 ] = totals[ i ];
- if ( table->stats[ i-2 ].counts )
- if ( ( current_order == -2 ) ||
- scoreboard[ table->stats[ i-2 ].symbol ] == 0 )
- totals[ i-1 ] += table->stats[ i-2 ].counts;
- if ( table->stats[ i-2 ].counts > max )
- max = table->stats[ i-2 ].counts;
- }
- /*
- * Here is where the escape calculation needs to take place.
- */
- if ( max == 0 )
- totals[ 0 ] = 1;
- else
- {
- totals[ 0 ] = (short int) ( 256 - table->max_index );
- totals[ 0 ] *= table->max_index;
- totals[ 0 ] /= 256;
- totals[ 0 ] /= max;
- totals[ 0 ]++;
- totals[ 0 ] += totals[ 1 ];
- }
- if ( totals[ 0 ] < MAXIMUM_SCALE )
- break;
- rescale_table( table );
- }
- for ( i = 0 ; i < table->max_index ; i++ )
- if (table->stats[i].counts != 0)
- scoreboard[ table->stats[ i ].symbol ] = 1;
- }
-
- /*
- * This routine is called when the entire model is to be flushed.
- * This is done in an attempt to improve the compression ratio by
- * giving greater weight to upcoming statistics. This routine
- * starts at the given table, and recursively calls itself to
- * rescale every table in its list of links. The table itself
- * is then rescaled.
- */
- void recursive_flush( CONTEXT *table )
- {
- int i;
-
- if ( table->links != NULL )
- for ( i = 0 ; i <= table->max_index ; i++ )
- if ( table->links[ i ].next != NULL )
- recursive_flush( table->links[ i ].next );
- rescale_table( table );
- }
-
- /*
- * This routine is called to flush the whole table, which it does
- * by calling the recursive flush routine starting at the order 0
- * table.
- */
- void flush_model()
- {
- recursive_flush( contexts[ 0 ] );
- }
-
- void error_exit( char *message)
- {
- putc( '\n', stdout );
- puts( message );
- exit( -1 );
- }
-
-
-